# Using beautiful soup to scrape data
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
# use geocoder library, if not present use !conda install -c conda-forge geocoder
import geocoder
# Google API key is required for the geocoder library to work, save the API key in OS environment variables as GOOGLE_API_KEY
# and then access that key here
import os
# Use BING_API_KEY when choosing to use bing geocoding instead of google geocoding.
# Prefer the BING_API_KEY environment variable; fall back to the embedded key
# so existing runs keep working.
# NOTE(review): hard-coding an API key in source is a security risk — this key
# should be rotated and supplied only via the environment.
BING_API_KEY = os.environ.get('BING_API_KEY', 'AksNN-3luSfNBssyZ3Ju4i78nIrFLt1UtYo--YWQj9oyfxSwyXkdsqykWk3FeTXB')
# This function will take an address and return the latitude/longitude of that address
def get_latlng(address):
    """Geocode *address* with the Bing geocoder and return a 2-element Series.

    Returns pd.Series([latitude, longitude]). When geocoding fails,
    ``g.latlng`` is None; substitute ``[nan, nan]`` so callers assigning the
    result to two DataFrame columns still get a well-shaped row and can
    ``dropna()`` afterwards (as this script does).
    """
    # Using the Bing geocoder API (BING_API_KEY is defined at module level).
    g = geocoder.bing(address, key=BING_API_KEY)
    # Keep the shape stable even on failed lookups.
    latlng = g.latlng if g.latlng else [np.nan, np.nan]
    return pd.Series(latlng)
# Function returns a soup object on the basis of URL
def get_soup_object(url):
    """Fetch *url* over HTTP and parse the response body with the lxml parser.

    Returns a BeautifulSoup object for the page.
    """
    response_text = requests.get(url).text
    soup = BeautifulSoup(response_text, 'lxml')
    return soup
# URL of the vlist.in page listing all blocks of Ranchi district (district id 364)
rnc_data_url = 'http://vlist.in/district/364.html'
# use function to get soup object
soup = get_soup_object(rnc_data_url)
print('Soup object created')
# base URL prefixed to the relative hrefs scraped from the table rows
village_url_header = 'http://vlist.in'
# district label stored alongside every scraped village row
district_name = 'Ranchi'
# This function extracts one row of the table on the government website, returning the name shown in the table and the link associated with that name
def extract_row(table_row, url_prefix='http://vlist.in'):
    """Extract the link and display name from one <tr> of a vlist.in table.

    Parameters
    ----------
    table_row : bs4 Tag
        A <tr> with at least two <td> cells; the second cell contains an
        <a> whose href is relative to the site root.
    url_prefix : str, optional
        Base URL joined to the relative href. Defaults to the vlist.in root,
        the same value as the module-level ``village_url_header``, so existing
        callers are unaffected.

    Returns
    -------
    (link, name) : tuple of str
        Absolute URL and the text of the second cell.
    """
    cells = table_row.find_all('td')
    # cells[0] holds a running index column; callers never use it.
    link = url_prefix + cells[1].find('a')['href']
    name = cells[1].text
    return link, name
# extracting the block rows
table_rows = soup.find_all('tr')
# drop the header row
table_rows = table_rows[1:]
# NOTE(review): a second [1:] drops one more leading row — presumably an
# extra header/sub-header row on the district page; confirm against the
# live markup
table_rows = table_rows[1:]
data = []
# for every block row all the villages will also be extracted
for table_row in table_rows:
    sub_district_link, block_name = extract_row(table_row)
    print(block_name)
    # getting the sub villages in block (one extra HTTP request per block)
    soup_village = get_soup_object(sub_district_link)
    # get all the table rows for individual villages in block
    sub_table_rows = soup_village.find_all('tr')
    # skip the header row of the village table
    sub_table_rows = sub_table_rows[1:]
    # extract individual village name and store it in data along with block name and district name
    for sub_table_row in sub_table_rows:
        sub_link, village_name = extract_row(sub_table_row)
        data.append([village_name, block_name, district_name])
print(data[0])
# save data in csv for future usage
header = ['Village','Block','District']
df = pd.DataFrame(data= data, columns= header)
# df.head()/df.info() are notebook-style inspection calls; their return
# values are discarded when this runs as a plain script
df.head()
# using the get_latlng function to define latitude and longitude columns of the data frame
# NOTE(review): this issues one Bing geocoding request per row — slow and
# quota-limited for large tables
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Village +', '+ x.Block + ', ' + x.District), axis=1)
df.head()
df.info()
# drop rows whose geocoding failed (NaN coordinates)
df.dropna(inplace= True)
df.info()
# data will be used later
df.to_csv('ranchi_villages.csv')
# Scrape the Wikipedia list of Delhi neighbourhoods.
delhi_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_of_Delhi'
soup = get_soup_object(delhi_data_url)
print('soup object created')
# The neighbourhood names live in the 2nd through 10th <ul> lists on the page.
row_groups = soup.find_all('ul')[1:10]
# Flatten every <li> into a [name, city] record.
row_items = [[row.text, 'Delhi'] for row_group in row_groups for row in row_group.find_all('li')]
# Report how many neighbourhoods were found.
print(len(row_items))
# Build the data frame and geocode each neighbourhood.
header = ['Neighborhood','City']
df = pd.DataFrame(data=row_items, columns=header)
df.tail()
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
df.info()
# Persist for later use.
df.to_csv('delhi_subdiv.csv')
# Scrape the Wikipedia list of Chennai neighbourhoods.
chennai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_of_Chennai'
soup = get_soup_object(chennai_data_url)
print('soup object created')
# The relevant <ul> groups are indices 1-7 on this page.
row_groups = soup.find_all('ul')[1:8]
row_items = []
for group in row_groups:
    # Collect each list item as a [name, city] record.
    row_items.extend([item.text, 'Chennai'] for item in group.find_all('li'))
# Report how many neighbourhoods were found.
print(len(row_items))
# Build the data frame, geocode each neighbourhood, and save to CSV.
header = ['Neighborhood','City']
df = pd.DataFrame(data=row_items, columns=header)
df.head()
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
df.info()
df.to_csv('chennai_subdiv.csv')
# Scrape the Wikipedia list of Kolkata metropolitan-area neighbourhoods.
kolkata_data_url = 'https://en.wikipedia.org/wiki/Neighbourhoods_in_Kolkata_Metropolitan_Area'
soup = get_soup_object(kolkata_data_url)
print('soup object created')
# Only the 2nd through 7th <ul> blocks hold neighbourhood names.
row_items = [[li.text, 'Kolkata']
             for ul in soup.find_all('ul')[1:7]
             for li in ul.find_all('li')]
# Report how many neighbourhoods were found.
print(len(row_items))
# Build the data frame, geocode each neighbourhood, and save to CSV.
header = ['Neighborhood','City']
df = pd.DataFrame(data=row_items, columns=header)
df.head()
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
df.info()
df.to_csv('kolkata_subdiv.csv')
# Reload from disk so the saved index column round-trips as the index.
df = pd.read_csv('kolkata_subdiv.csv', index_col=0)
df.head()
# Scrape the Wikipedia list of Mumbai neighbourhoods.
mumbai_data_url = 'https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Mumbai'
soup = get_soup_object(mumbai_data_url)
print('soup object created')
# The neighbourhood lists occupy the 6th through 36th <ul> elements here.
ward_lists = soup.find_all('ul')[5:36]
row_items = []
for ward_list in ward_lists:
    for entry in ward_list.find_all('li'):
        # Store each entry as a [name, city] record.
        row_items.append([entry.text, 'Mumbai'])
# Report how many neighbourhoods were found.
print(len(row_items))
# Build the data frame, geocode each neighbourhood, and save to CSV.
header = ['Neighborhood','City']
df = pd.DataFrame(data=row_items, columns=header)
df.head()
df[['Latitude','Longitude']] = df.apply(lambda x: get_latlng(x.Neighborhood + ', ' + x.City), axis=1)
df.head()
df.info()
df.to_csv('mumbai_subdiv.csv')
#!conda install -c conda-forge folium --yes # uncomment this line if folium is missing
import folium
# Function takes in a data frame with Latitude, Longitude, Neighborhood and City columns and shows it on map
def visualize_area_in_map(data, area_map=None):
    """Plot each row of *data* as a circle marker on a folium map.

    Parameters
    ----------
    data : DataFrame
        Must have 'Latitude', 'Longitude', 'Neighborhood' and 'City' columns.
    area_map : folium.Map, optional
        Map to draw on. Defaults to the module-level ``map`` for backward
        compatibility — the original implementation silently depended on
        that global (which also shadows the ``map`` builtin).

    Returns
    -------
    folium.Map
        The map with one CircleMarker per row.
    """
    if area_map is None:
        # Legacy behavior: fall back to the global created later in this script.
        area_map = map
    # add markers to map
    for lat, lng, neighborhood, city in zip(data['Latitude'], data['Longitude'],
                                            data['Neighborhood'], data['City']):
        label = folium.Popup('{}, {}'.format(neighborhood, city), parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=2,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(area_map)
    return area_map
# Center the demo map on Mumbai.
city = 'Mumbai'
# get_latlng returns a 2-element Series; unpack it into scalars
latitude, longitude = get_latlng(city)
print('Lat : ',latitude,' Long : ',longitude)
# create map of Mumbai using latitude and longitude values
# NOTE(review): the name `map` shadows the builtin; kept as-is because
# visualize_area_in_map reads this module-level name
map = folium.Map(location=[latitude, longitude], zoom_start=10)
# drop rows with missing coordinates before plotting
data = df.dropna()
visualize_area_in_map(data)